The following data set is a breast cancer data set with numerous measurements taken from tumor biopsies. The goal of using this data set is to predict, from the metrics alone, whether the biopsy is cancerous or not. When continuous variables are available it is often helpful to create a pairs plot of the data color coded by the response status (Diagnosis). The first variable is an id number and is not needed.
## id_number diagnosis radius_mean texture_mean
## Min. : 8670 B:357 Min. : 6.981 Min. : 9.71
## 1st Qu.: 869218 M:212 1st Qu.:11.700 1st Qu.:16.17
## Median : 906024 Median :13.370 Median :18.84
## Mean : 30371831 Mean :14.127 Mean :19.29
## 3rd Qu.: 8813129 3rd Qu.:15.780 3rd Qu.:21.80
## Max. :911320502 Max. :28.110 Max. :39.28
## perimeter_mean area_mean smoothness_mean compactness_mean
## Min. : 43.79 Min. : 143.5 Min. :0.05263 Min. :0.01938
## 1st Qu.: 75.17 1st Qu.: 420.3 1st Qu.:0.08637 1st Qu.:0.06492
## Median : 86.24 Median : 551.1 Median :0.09587 Median :0.09263
## Mean : 91.97 Mean : 654.9 Mean :0.09636 Mean :0.10434
## 3rd Qu.:104.10 3rd Qu.: 782.7 3rd Qu.:0.10530 3rd Qu.:0.13040
## Max. :188.50 Max. :2501.0 Max. :0.16340 Max. :0.34540
## concavity_mean concave_points_mean symmetry_mean
## Min. :0.00000 Min. :0.00000 Min. :0.1060
## 1st Qu.:0.02956 1st Qu.:0.02031 1st Qu.:0.1619
## Median :0.06154 Median :0.03350 Median :0.1792
## Mean :0.08880 Mean :0.04892 Mean :0.1812
## 3rd Qu.:0.13070 3rd Qu.:0.07400 3rd Qu.:0.1957
## Max. :0.42680 Max. :0.20120 Max. :0.3040
## fractal_dimension_mean radius_se texture_se perimeter_se
## Min. :0.04996 Min. :0.1115 Min. :0.3602 Min. : 0.757
## 1st Qu.:0.05770 1st Qu.:0.2324 1st Qu.:0.8339 1st Qu.: 1.606
## Median :0.06154 Median :0.3242 Median :1.1080 Median : 2.287
## Mean :0.06280 Mean :0.4052 Mean :1.2169 Mean : 2.866
## 3rd Qu.:0.06612 3rd Qu.:0.4789 3rd Qu.:1.4740 3rd Qu.: 3.357
## Max. :0.09744 Max. :2.8730 Max. :4.8850 Max. :21.980
## area_se smoothness_se compactness_se concavity_se
## Min. : 6.802 Min. :0.001713 Min. :0.002252 Min. :0.00000
## 1st Qu.: 17.850 1st Qu.:0.005169 1st Qu.:0.013080 1st Qu.:0.01509
## Median : 24.530 Median :0.006380 Median :0.020450 Median :0.02589
## Mean : 40.337 Mean :0.007041 Mean :0.025478 Mean :0.03189
## 3rd Qu.: 45.190 3rd Qu.:0.008146 3rd Qu.:0.032450 3rd Qu.:0.04205
## Max. :542.200 Max. :0.031130 Max. :0.135400 Max. :0.39600
## concave_points_se symmetry_se fractal_dimension_se
## Min. :0.000000 Min. :0.007882 Min. :0.0008948
## 1st Qu.:0.007638 1st Qu.:0.015160 1st Qu.:0.0022480
## Median :0.010930 Median :0.018730 Median :0.0031870
## Mean :0.011796 Mean :0.020542 Mean :0.0037949
## 3rd Qu.:0.014710 3rd Qu.:0.023480 3rd Qu.:0.0045580
## Max. :0.052790 Max. :0.078950 Max. :0.0298400
## radius_worst texture_worst perimeter_worst area_worst
## Min. : 7.93 Min. :12.02 Min. : 50.41 Min. : 185.2
## 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11 1st Qu.: 515.3
## Median :14.97 Median :25.41 Median : 97.66 Median : 686.5
## Mean :16.27 Mean :25.68 Mean :107.26 Mean : 880.6
## 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40 3rd Qu.:1084.0
## Max. :36.04 Max. :49.54 Max. :251.20 Max. :4254.0
## smoothness_worst compactness_worst concavity_worst concave_points_worst
## Min. :0.07117 Min. :0.02729 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145 1st Qu.:0.06493
## Median :0.13130 Median :0.21190 Median :0.2267 Median :0.09993
## Mean :0.13237 Mean :0.25427 Mean :0.2722 Mean :0.11461
## 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829 3rd Qu.:0.16140
## Max. :0.22260 Max. :1.05800 Max. :1.2520 Max. :0.29100
## symmetry_worst fractal_dimension_worst
## Min. :0.1565 Min. :0.05504
## 1st Qu.:0.2504 1st Qu.:0.07146
## Median :0.2822 Median :0.08004
## Mean :0.2901 Mean :0.08395
## 3rd Qu.:0.3179 3rd Qu.:0.09208
## Max. :0.6638 Max. :0.20750
## Observations: 569
## Variables: 31
## $ radius_mean <dbl> 17.990, 20.570, 19.690, 11.420, 20.290, …
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70…
## $ perimeter_mean <dbl> 122.80, 132.90, 130.00, 77.58, 135.10, 8…
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 4…
## $ smoothness_mean <dbl> 0.11840, 0.08474, 0.10960, 0.14250, 0.10…
## $ compactness_mean <dbl> 0.27760, 0.07864, 0.15990, 0.28390, 0.13…
## $ concavity_mean <dbl> 0.30010, 0.08690, 0.19740, 0.24140, 0.19…
## $ concave_points_mean <dbl> 0.14710, 0.07017, 0.12790, 0.10520, 0.10…
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, …
## $ fractal_dimension_mean <dbl> 0.07871, 0.05667, 0.05999, 0.09744, 0.05…
## $ radius_se <dbl> 1.0950, 0.5435, 0.7456, 0.4956, 0.7572, …
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, …
## $ perimeter_se <dbl> 8.589, 3.398, 4.585, 3.445, 5.438, 2.217…
## $ area_se <dbl> 153.40, 74.08, 94.03, 27.23, 94.44, 27.1…
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.009110, …
## $ compactness_se <dbl> 0.049040, 0.013080, 0.040060, 0.074580, …
## $ concavity_se <dbl> 0.05373, 0.01860, 0.03832, 0.05661, 0.05…
## $ concave_points_se <dbl> 0.015870, 0.013400, 0.020580, 0.018670, …
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01…
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.009208, …
## $ radius_worst <dbl> 25.38, 24.99, 23.57, 14.91, 22.54, 15.47…
## $ texture_worst <dbl> 17.33, 23.41, 25.53, 26.50, 16.67, 23.75…
## $ perimeter_worst <dbl> 184.60, 158.80, 152.50, 98.87, 152.20, 1…
## $ area_worst <dbl> 2019.0, 1956.0, 1709.0, 567.7, 1575.0, 7…
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, …
## $ compactness_worst <dbl> 0.6656, 0.1866, 0.4245, 0.8663, 0.2050, …
## $ concavity_worst <dbl> 0.71190, 0.24160, 0.45040, 0.68690, 0.40…
## $ concave_points_worst <dbl> 0.26540, 0.18600, 0.24300, 0.25750, 0.16…
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, …
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07…
## $ outcome <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
# Rename the outcome column to "diagnosis". The glimpse above shows the
# label column is named "outcome"; matching it by name is safer than the
# magic index 31, which would silently rename the wrong column if the
# column order ever changes.
names(BrCanClean)[names(BrCanClean) == "outcome"] <- "diagnosis"
summary(BrCanClean)
## radius_mean texture_mean perimeter_mean area_mean
## Min. : 6.981 Min. : 9.71 Min. : 43.79 Min. : 143.5
## 1st Qu.:11.700 1st Qu.:16.17 1st Qu.: 75.17 1st Qu.: 420.3
## Median :13.370 Median :18.84 Median : 86.24 Median : 551.1
## Mean :14.127 Mean :19.29 Mean : 91.97 Mean : 654.9
## 3rd Qu.:15.780 3rd Qu.:21.80 3rd Qu.:104.10 3rd Qu.: 782.7
## Max. :28.110 Max. :39.28 Max. :188.50 Max. :2501.0
## smoothness_mean compactness_mean concavity_mean concave_points_mean
## Min. :0.05263 Min. :0.01938 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.08637 1st Qu.:0.06492 1st Qu.:0.02956 1st Qu.:0.02031
## Median :0.09587 Median :0.09263 Median :0.06154 Median :0.03350
## Mean :0.09636 Mean :0.10434 Mean :0.08880 Mean :0.04892
## 3rd Qu.:0.10530 3rd Qu.:0.13040 3rd Qu.:0.13070 3rd Qu.:0.07400
## Max. :0.16340 Max. :0.34540 Max. :0.42680 Max. :0.20120
## symmetry_mean fractal_dimension_mean radius_se texture_se
## Min. :0.1060 Min. :0.04996 Min. :0.1115 Min. :0.3602
## 1st Qu.:0.1619 1st Qu.:0.05770 1st Qu.:0.2324 1st Qu.:0.8339
## Median :0.1792 Median :0.06154 Median :0.3242 Median :1.1080
## Mean :0.1812 Mean :0.06280 Mean :0.4052 Mean :1.2169
## 3rd Qu.:0.1957 3rd Qu.:0.06612 3rd Qu.:0.4789 3rd Qu.:1.4740
## Max. :0.3040 Max. :0.09744 Max. :2.8730 Max. :4.8850
## perimeter_se area_se smoothness_se compactness_se
## Min. : 0.757 Min. : 6.802 Min. :0.001713 Min. :0.002252
## 1st Qu.: 1.606 1st Qu.: 17.850 1st Qu.:0.005169 1st Qu.:0.013080
## Median : 2.287 Median : 24.530 Median :0.006380 Median :0.020450
## Mean : 2.866 Mean : 40.337 Mean :0.007041 Mean :0.025478
## 3rd Qu.: 3.357 3rd Qu.: 45.190 3rd Qu.:0.008146 3rd Qu.:0.032450
## Max. :21.980 Max. :542.200 Max. :0.031130 Max. :0.135400
## concavity_se concave_points_se symmetry_se
## Min. :0.00000 Min. :0.000000 Min. :0.007882
## 1st Qu.:0.01509 1st Qu.:0.007638 1st Qu.:0.015160
## Median :0.02589 Median :0.010930 Median :0.018730
## Mean :0.03189 Mean :0.011796 Mean :0.020542
## 3rd Qu.:0.04205 3rd Qu.:0.014710 3rd Qu.:0.023480
## Max. :0.39600 Max. :0.052790 Max. :0.078950
## fractal_dimension_se radius_worst texture_worst perimeter_worst
## Min. :0.0008948 Min. : 7.93 Min. :12.02 Min. : 50.41
## 1st Qu.:0.0022480 1st Qu.:13.01 1st Qu.:21.08 1st Qu.: 84.11
## Median :0.0031870 Median :14.97 Median :25.41 Median : 97.66
## Mean :0.0037949 Mean :16.27 Mean :25.68 Mean :107.26
## 3rd Qu.:0.0045580 3rd Qu.:18.79 3rd Qu.:29.72 3rd Qu.:125.40
## Max. :0.0298400 Max. :36.04 Max. :49.54 Max. :251.20
## area_worst smoothness_worst compactness_worst concavity_worst
## Min. : 185.2 Min. :0.07117 Min. :0.02729 Min. :0.0000
## 1st Qu.: 515.3 1st Qu.:0.11660 1st Qu.:0.14720 1st Qu.:0.1145
## Median : 686.5 Median :0.13130 Median :0.21190 Median :0.2267
## Mean : 880.6 Mean :0.13237 Mean :0.25427 Mean :0.2722
## 3rd Qu.:1084.0 3rd Qu.:0.14600 3rd Qu.:0.33910 3rd Qu.:0.3829
## Max. :4254.0 Max. :0.22260 Max. :1.05800 Max. :1.2520
## concave_points_worst symmetry_worst fractal_dimension_worst diagnosis
## Min. :0.00000 Min. :0.1565 Min. :0.05504 0:357
## 1st Qu.:0.06493 1st Qu.:0.2504 1st Qu.:0.07146 1:212
## Median :0.09993 Median :0.2822 Median :0.08004
## Mean :0.11461 Mean :0.2901 Mean :0.08395
## 3rd Qu.:0.16140 3rd Qu.:0.3179 3rd Qu.:0.09208
## Max. :0.29100 Max. :0.6638 Max. :0.20750
# Class counts for the response: diagnosis is coded 0 = benign, 1 = malignant.
table(BrCanClean$diagnosis)
##
## 0 1
## 357 212
# Structure check: 30 numeric predictors plus the diagnosis factor (569 rows).
str(BrCanClean)
## 'data.frame': 569 obs. of 31 variables:
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave_points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave_points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave_points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
## $ diagnosis : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Check the balance of the label: 0 = benign (357), 1 = malignant (212).
table(BrCanClean$diagnosis)
##
## 0 1
## 357 212
plot(BrCanClean$diagnosis)
# Derive the slice sizes from the data instead of hard-coding c(357, 212),
# so the chart stays correct if the data set ever changes.
Cancer <- as.vector(table(BrCanClean$diagnosis))
Lbls <- c("Benign", "Malignant")
pie(Cancer, labels = Lbls, main = "Breast Cancer Class Distribution") #### ---------------------># plot to use in report
## radius_mean texture_mean perimeter_mean area_mean smoothness_mean
## 1 17.99 10.38 122.80 1001.0 0.11840
## 2 20.57 17.77 132.90 1326.0 0.08474
## 3 19.69 21.25 130.00 1203.0 0.10960
## 4 11.42 20.38 77.58 386.1 0.14250
## 5 20.29 14.34 135.10 1297.0 0.10030
## 6 12.45 15.70 82.57 477.1 0.12780
## compactness_mean concavity_mean concave_points_mean symmetry_mean
## 1 0.27760 0.3001 0.14710 0.2419
## 2 0.07864 0.0869 0.07017 0.1812
## 3 0.15990 0.1974 0.12790 0.2069
## 4 0.28390 0.2414 0.10520 0.2597
## 5 0.13280 0.1980 0.10430 0.1809
## 6 0.17000 0.1578 0.08089 0.2087
## fractal_dimension_mean radius_se texture_se perimeter_se area_se
## 1 0.07871 1.0950 0.9053 8.589 153.40
## 2 0.05667 0.5435 0.7339 3.398 74.08
## 3 0.05999 0.7456 0.7869 4.585 94.03
## 4 0.09744 0.4956 1.1560 3.445 27.23
## 5 0.05883 0.7572 0.7813 5.438 94.44
## 6 0.07613 0.3345 0.8902 2.217 27.19
## smoothness_se compactness_se concavity_se concave_points_se symmetry_se
## 1 0.006399 0.04904 0.05373 0.01587 0.03003
## 2 0.005225 0.01308 0.01860 0.01340 0.01389
## 3 0.006150 0.04006 0.03832 0.02058 0.02250
## 4 0.009110 0.07458 0.05661 0.01867 0.05963
## 5 0.011490 0.02461 0.05688 0.01885 0.01756
## 6 0.007510 0.03345 0.03672 0.01137 0.02165
## fractal_dimension_se radius_worst texture_worst perimeter_worst
## 1 0.006193 25.38 17.33 184.60
## 2 0.003532 24.99 23.41 158.80
## 3 0.004571 23.57 25.53 152.50
## 4 0.009208 14.91 26.50 98.87
## 5 0.005115 22.54 16.67 152.20
## 6 0.005082 15.47 23.75 103.40
## area_worst smoothness_worst compactness_worst concavity_worst
## 1 2019.0 0.1622 0.6656 0.7119
## 2 1956.0 0.1238 0.1866 0.2416
## 3 1709.0 0.1444 0.4245 0.4504
## 4 567.7 0.2098 0.8663 0.6869
## 5 1575.0 0.1374 0.2050 0.4000
## 6 741.6 0.1791 0.5249 0.5355
## concave_points_worst symmetry_worst fractal_dimension_worst diagnosis
## 1 0.2654 0.4601 0.11890 1
## 2 0.1860 0.2750 0.08902 1
## 3 0.2430 0.3613 0.08758 1
## 4 0.2575 0.6638 0.17300 1
## 5 0.1625 0.2364 0.07678 1
## 6 0.1741 0.3985 0.12440 1
## [1] 569 31
## 'data.frame': 569 obs. of 31 variables:
## $ radius_mean : num 18 20.6 19.7 11.4 20.3 ...
## $ texture_mean : num 10.4 17.8 21.2 20.4 14.3 ...
## $ perimeter_mean : num 122.8 132.9 130 77.6 135.1 ...
## $ area_mean : num 1001 1326 1203 386 1297 ...
## $ smoothness_mean : num 0.1184 0.0847 0.1096 0.1425 0.1003 ...
## $ compactness_mean : num 0.2776 0.0786 0.1599 0.2839 0.1328 ...
## $ concavity_mean : num 0.3001 0.0869 0.1974 0.2414 0.198 ...
## $ concave_points_mean : num 0.1471 0.0702 0.1279 0.1052 0.1043 ...
## $ symmetry_mean : num 0.242 0.181 0.207 0.26 0.181 ...
## $ fractal_dimension_mean : num 0.0787 0.0567 0.06 0.0974 0.0588 ...
## $ radius_se : num 1.095 0.543 0.746 0.496 0.757 ...
## $ texture_se : num 0.905 0.734 0.787 1.156 0.781 ...
## $ perimeter_se : num 8.59 3.4 4.58 3.44 5.44 ...
## $ area_se : num 153.4 74.1 94 27.2 94.4 ...
## $ smoothness_se : num 0.0064 0.00522 0.00615 0.00911 0.01149 ...
## $ compactness_se : num 0.049 0.0131 0.0401 0.0746 0.0246 ...
## $ concavity_se : num 0.0537 0.0186 0.0383 0.0566 0.0569 ...
## $ concave_points_se : num 0.0159 0.0134 0.0206 0.0187 0.0188 ...
## $ symmetry_se : num 0.03 0.0139 0.0225 0.0596 0.0176 ...
## $ fractal_dimension_se : num 0.00619 0.00353 0.00457 0.00921 0.00511 ...
## $ radius_worst : num 25.4 25 23.6 14.9 22.5 ...
## $ texture_worst : num 17.3 23.4 25.5 26.5 16.7 ...
## $ perimeter_worst : num 184.6 158.8 152.5 98.9 152.2 ...
## $ area_worst : num 2019 1956 1709 568 1575 ...
## $ smoothness_worst : num 0.162 0.124 0.144 0.21 0.137 ...
## $ compactness_worst : num 0.666 0.187 0.424 0.866 0.205 ...
## $ concavity_worst : num 0.712 0.242 0.45 0.687 0.4 ...
## $ concave_points_worst : num 0.265 0.186 0.243 0.258 0.163 ...
## $ symmetry_worst : num 0.46 0.275 0.361 0.664 0.236 ...
## $ fractal_dimension_worst: num 0.1189 0.089 0.0876 0.173 0.0768 ...
## $ diagnosis : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
# Mean Area, Radius and Concavity vs Diagnosis
## Compare row 7 and column 8 with corr 0.921
## Means: 0.571 vs 0.389 so flagging column 7
## Compare row 8 and column 6 with corr 0.831
## Means: 0.542 vs 0.377 so flagging column 8
## Compare row 6 and column 28 with corr 0.816
## Means: 0.524 vs 0.365 so flagging column 6
## Compare row 28 and column 27 with corr 0.855
## Means: 0.507 vs 0.354 so flagging column 28
## Compare row 27 and column 26 with corr 0.892
## Means: 0.457 vs 0.343 so flagging column 27
## Compare row 23 and column 21 with corr 0.994
## Means: 0.456 vs 0.333 so flagging column 23
## Compare row 21 and column 3 with corr 0.969
## Means: 0.422 vs 0.324 so flagging column 21
## Compare row 3 and column 24 with corr 0.942
## Means: 0.384 vs 0.316 so flagging column 3
## Compare row 26 and column 30 with corr 0.81
## Means: 0.4 vs 0.313 so flagging column 26
## Compare row 24 and column 1 with corr 0.941
## Means: 0.356 vs 0.302 so flagging column 24
## Compare row 1 and column 4 with corr 0.987
## Means: 0.308 vs 0.298 so flagging column 1
## Compare row 4 and column 13 with corr 0.727
## Means: 0.27 vs 0.294 so flagging column 13
## Compare row 4 and column 11 with corr 0.733
## Means: 0.244 vs 0.29 so flagging column 11
## Compare row 4 and column 14 with corr 0.8
## Means: 0.213 vs 0.294 so flagging column 14
## Compare row 18 and column 16 with corr 0.744
## Means: 0.36 vs 0.292 so flagging column 18
## Compare row 16 and column 17 with corr 0.801
## Means: 0.394 vs 0.288 so flagging column 16
## Compare row 17 and column 20 with corr 0.727
## Means: 0.292 vs 0.272 so flagging column 17
## Compare row 5 and column 25 with corr 0.805
## Means: 0.33 vs 0.268 so flagging column 5
## Compare row 10 and column 30 with corr 0.767
## Means: 0.372 vs 0.256 so flagging column 10
## Compare row 22 and column 2 with corr 0.912
## Means: 0.253 vs 0.243 so flagging column 22
## All correlations <= 0.7
## Observations: 569
## Variables: 11
## $ texture_mean <dbl> 10.38, 17.77, 21.25, 20.38, 14.34, 15.70…
## $ area_mean <dbl> 1001.0, 1326.0, 1203.0, 386.1, 1297.0, 4…
## $ symmetry_mean <dbl> 0.2419, 0.1812, 0.2069, 0.2597, 0.1809, …
## $ texture_se <dbl> 0.9053, 0.7339, 0.7869, 1.1560, 0.7813, …
## $ smoothness_se <dbl> 0.006399, 0.005225, 0.006150, 0.009110, …
## $ symmetry_se <dbl> 0.03003, 0.01389, 0.02250, 0.05963, 0.01…
## $ fractal_dimension_se <dbl> 0.006193, 0.003532, 0.004571, 0.009208, …
## $ smoothness_worst <dbl> 0.1622, 0.1238, 0.1444, 0.2098, 0.1374, …
## $ symmetry_worst <dbl> 0.4601, 0.2750, 0.3613, 0.6638, 0.2364, …
## $ fractal_dimension_worst <dbl> 0.11890, 0.08902, 0.08758, 0.17300, 0.07…
## $ diagnosis <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## integer(0)
## integer(0)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion 0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion 0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion 0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion 0.99942 0.99969 0.99992 0.99997 1.00000 1.00000
#### PCA Proportion of Variance ################
# Per-component variance (sdev^2) and its cumulative share of total variance.
VBrCanPCA <- BrCanPCA$sdev^2
PVBrCanEr <- VBrCanPCA / sum(VBrCanPCA)
PVBrCum <- cumsum(PVBrCanEr)
# seq_len(n) replaces the redundant seq(1:n) idiom (seq over an already-built
# 1:n vector); the component index runs over the 30 predictor columns.
PVBrEr <- tibble(comp = seq_len(ncol(BrCanClean %>% select(-diagnosis))), PVBrCanEr, PVBrCum)
ggplot(PVBrEr, aes(x = comp, y = PVBrCum)) +
  geom_point(color = "orange", size = 3) +
  labs(x = "Components",
       y = "Cumulative Proportion of Variance",
       title = "Principal Variance Components") +
  # geom_hline is the idiomatic horizontal reference line (here: 95% mark),
  # replacing geom_abline(intercept = 0.95, slope = 0).
  geom_hline(yintercept = 0.95, color = "blue") #### ---------------------># plot to use in report
#### Principal Components Analysis - Correlation ################
# PCA on the 10 "mean" measurements only. Spell out prcomp's `scale.`
# argument in full: the original `scale = TRUE` only worked through partial
# argument matching, which is fragile and flagged by lintr.
LoBrCanPCA <- prcomp(BrCanData[, 1:10], center = TRUE, scale. = TRUE)
summary(LoBrCanPCA)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.7759 1.3824 1.1846 1.0590 0.88702 0.85521 0.61512
## Proportion of Variance 0.3154 0.1911 0.1403 0.1121 0.07868 0.07314 0.03784
## Cumulative Proportion 0.3154 0.5065 0.6468 0.7590 0.83764 0.91077 0.94861
## PC8 PC9 PC10
## Standard deviation 0.55416 0.33200 0.31078
## Proportion of Variance 0.03071 0.01102 0.00966
## Cumulative Proportion 0.97932 0.99034 1.00000
# plot() here is base graphics, not ggplot: it draws as a side effect and does
# not return a plot object you can add to. The original `p + grid(...)` and
# `p + title(...)` only "worked" because grid()/title() draw as side effects;
# the `+` itself produced the stray `## integer(0)` echoes in the report.
# Call the annotation functions directly instead.
p <- plot(LoBrCanPCA, type = "l", main = "", col = "red", lty = 1)
grid(nx = 10, ny = 14, col = "blue")
title(main = "Principal Components Analysis - Correlation", sub = NULL, xlab = "Components")
box(col = "blue") #### ---------------------># plot to use in report
#### PCA Proportion of Variance ################
# Same cumulative-variance plot as above, but for the reduced (10-feature) PCA.
VBrCanPCA1 <- LoBrCanPCA$sdev^2
PVBrCanEr1 <- VBrCanPCA1 / sum(VBrCanPCA1)
PVBrCum1 <- cumsum(PVBrCanEr1)
# seq_len(n) replaces the redundant seq(1:n) idiom.
PVBrEr1 <- tibble(comp = seq_len(ncol(BrCanData %>% select(-diagnosis))), PVBrCanEr1, PVBrCum1)
ggplot(PVBrEr1, aes(x = comp, y = PVBrCum1)) +
  geom_point(color = "orange", size = 3) +
  labs(x = "Components",
       y = "Cumulative Proportion of Variance",
       title = "Principal Variance Components - Correlation") +
  # Horizontal 95% reference line, idiomatic form.
  geom_hline(yintercept = 0.95, color = "blue") #### ---------------------># plot to use in report
############ PCA Full Vs Correlation ##################
# Put the color variable inside the plotting data frame rather than reaching
# into BrCanClean$diagnosis from aes(): mapping an external vector in aes()
# is a ggplot2 anti-pattern (breaks if rows are ever filtered/reordered).
PCAfc <- as.data.frame(LoBrCanPCA$x)
PCAfc$diagnosis <- BrCanClean$diagnosis
ggplot(PCAfc, aes(x = PC1, y = PC2, col = diagnosis)) + geom_point(alpha = 0.5)
############ Most influential variables of PC1 & PC2 ####
# Biplot of the first two PCs, points colored by diagnosis.
# NOTE(review): loadings = FALSE hides the loading arrows while
# loadings.label = TRUE still requests their labels — confirm this mix is the
# intended look for the report figure.
autoplot(LoBrCanPCA, data = BrCanData, colour = 'diagnosis',
loadings = FALSE, loadings.label = TRUE, loadings.colour = "purple", loadings.label.colour = "black") #### ---------------------># plot to use in report
############ Separate Diagnosis classes and Variance ####
# enframe() turns the diagnosis factor into a two-column tibble (name, value);
# the factor lands in the `value` column, which drives the coloring below.
BrCanPCS <- cbind(tibble::enframe(BrCanData$diagnosis), as_tibble(LoBrCanPCA$x))
# Pairs plot of columns 2:4 of BrCanPCS, colored by diagnosis (`value`).
GGally::ggpairs(BrCanPCS, columns = 2:4, ggplot2::aes(color = value)) #### ---------------------># plot to use in report
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Let's review the class imbalance (0 = benign, 1 = malignant).
table(BrCanData$diagnosis)
##
## 0 1
## 357 212
cm_plot <- function(ml, title) {
  # Render a fitted model's confusion matrix as a fourfold plot.
  #
  # ml    : object accepted by confusionMatrix() (e.g. a caret fit)
  # title : main title placed above the plot
  #
  # Called for its side effect (the plot). The red/green palette marks the
  # two diagonals of the 2x2 table.
  cm_counts <- round(confusionMatrix(ml)$table, 1)
  fourfoldplot(
    cm_counts,
    color = c("#CC6666", "#99CC99"),
    main = title,
    conf.level = 0,
    margin = 1
  )
}
# factoextra provides fviz_eig() for the scree plots below.
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# BrCanPCA is the full 30-predictor PCA (fit before this chunk; see its
# 30-component summary below).
summary(BrCanPCA)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 3.6444 2.3857 1.67867 1.40735 1.28403 1.09880
## Proportion of Variance 0.4427 0.1897 0.09393 0.06602 0.05496 0.04025
## Cumulative Proportion 0.4427 0.6324 0.72636 0.79239 0.84734 0.88759
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.82172 0.69037 0.6457 0.59219 0.5421 0.51104
## Proportion of Variance 0.02251 0.01589 0.0139 0.01169 0.0098 0.00871
## Cumulative Proportion 0.91010 0.92598 0.9399 0.95157 0.9614 0.97007
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.49128 0.39624 0.30681 0.28260 0.24372 0.22939
## Proportion of Variance 0.00805 0.00523 0.00314 0.00266 0.00198 0.00175
## Cumulative Proportion 0.97812 0.98335 0.98649 0.98915 0.99113 0.99288
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.22244 0.17652 0.1731 0.16565 0.15602 0.1344
## Proportion of Variance 0.00165 0.00104 0.0010 0.00091 0.00081 0.0006
## Cumulative Proportion 0.99453 0.99557 0.9966 0.99749 0.99830 0.9989
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.12442 0.09043 0.08307 0.03987 0.02736 0.01153
## Proportion of Variance 0.00052 0.00027 0.00023 0.00005 0.00002 0.00000
## Cumulative Proportion 0.99942 0.99969 0.99992 0.99997 1.00000 1.00000
# Scree plot of the full PCA: % of variance explained per component.
fviz_eig(BrCanPCA, addlabels = TRUE, ylim = c(0,100), barfill = "steelblue1", line="navy") +
theme_classic() +
labs(x = "Principal Components", y = "% of Explained Variance", title = "BrCanPCA - Principal Components")
### We see that 44.3% of the variance is explained by the first principal component. ### The first two components together explain 63.24% of the variance. We need 10 principal components to explain more than 95% of the variance and 17 to explain more than 99%.
library(factoextra)
# Reduced PCA fit on the 10 "mean" features only (correlation-based).
summary(LoBrCanPCA)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.7759 1.3824 1.1846 1.0590 0.88702 0.85521 0.61512
## Proportion of Variance 0.3154 0.1911 0.1403 0.1121 0.07868 0.07314 0.03784
## Cumulative Proportion 0.3154 0.5065 0.6468 0.7590 0.83764 0.91077 0.94861
## PC8 PC9 PC10
## Standard deviation 0.55416 0.33200 0.31078
## Proportion of Variance 0.03071 0.01102 0.00966
## Cumulative Proportion 0.97932 0.99034 1.00000
# Scree plot of the reduced (10-feature) PCA.
fviz_eig(LoBrCanPCA, addlabels = TRUE, ylim = c(0,100), barfill = "steelblue1", line="navy") +
theme_classic() +
labs(x = "Principal Components", y = "% of Explained Variance", title = "LoBrCanPCA - Principal Components")
# Reproducible 70/30 stratified split on the diagnosis label.
set.seed(1234)
train_indx <- createDataPartition(BrCanData$diagnosis, p = 0.7, list = FALSE)
# "0"-suffixed sets use the full 30-predictor data (BrCanClean);
# unsuffixed sets use the reduced data (BrCanData). Same row indices for both.
train_set0 <- BrCanClean[train_indx, ]
test_set0 <- BrCanClean[-train_indx, ]
train_set <- BrCanData[train_indx, ]
test_set <- BrCanData[-train_indx, ]
# BUG FIX: BrCanData has no column named `diagnosis.factor` (see the str()
# output above), so `BrCanData$diagnosis.factor` silently returned NULL —
# `$` partial matching only matches name prefixes. testValues is presumably
# meant to hold the held-out labels; take them from the test rows.
# TODO(review): confirm against where testValues is used later in the file.
testValues <- test_set$diagnosis
nrow(train_set)
## [1] 399
nrow(test_set)
## [1] 170
# Univariate logistic regression: P(malignant) as a function of mean tumor
# area, fit on the training split only.
BrCanModel <- glm(diagnosis ~ area_mean, data=train_set, family = "binomial")
summary(BrCanModel)
##
## Call:
## glm(formula = diagnosis ~ area_mean, family = "binomial", data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.79420 -0.42289 -0.18900 0.08522 2.78740
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.493794 0.880489 -9.647 <2e-16 ***
## area_mean 0.012476 0.001387 8.996 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 527.28 on 398 degrees of freedom
## Residual deviance: 211.67 on 397 degrees of freedom
## AIC: 215.67
##
## Number of Fisher Scoring iterations: 7
# Training data vs mean area, with the fitted logistic curve overlaid.
plot(diagnosis ~ area_mean, data = train_set, main = "Breast Cancer Regression Curve",
     ylab = "Probability of Cancerous Tumor", pch = 16)
# Use the coef() accessor instead of `$coef` (which relied on partial
# matching of the $coefficients element), and plogis() — the logistic CDF,
# plogis(z) = exp(z) / (1 + exp(z)) — instead of hand-rolling the ratio,
# which also avoids exp() overflow for large linear predictors.
curve(
  plogis(coef(BrCanModel)[1] + coef(BrCanModel)[2] * x),
  add = TRUE
)
#install.packages("ResourceSelection")
#install.packages("pander")
# ResourceSelection supplies hoslem.test() for the goodness-of-fit check.
library(ResourceSelection)
## ResourceSelection 0.3-5 2019-07-22
#library(pander)
# Hosmer-Lemeshow goodness-of-fit test on the univariate model; the p-value
# of 0.1462 (below) gives no evidence of lack of fit.
hoslem.test(BrCanModel$y, BrCanModel$fitted.values)
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: BrCanModel$y, BrCanModel$fitted.values
## X-squared = 12.114, df = 8, p-value = 0.1462
#pander()
# Full logistic model on all predictors in train_set0. The features are
# highly collinear, producing quasi-complete separation -- hence the
# "fitted probabilities numerically 0 or 1" warning and the huge,
# uninterpretable coefficients and standard errors in the summary below.
logicFit0 <- glm(diagnosis ~ . , data=train_set0, family = binomial(link = "logit") , control = list(maxit = 50))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logicFit0)
##
## Call:
## glm(formula = diagnosis ~ ., family = binomial(link = "logit"),
## data = train_set0, control = list(maxit = 50))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.973e-06 -2.110e-08 -2.110e-08 2.110e-08 8.053e-06
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.553e+02 7.503e+06 0 1
## radius_mean -1.249e+03 6.546e+06 0 1
## texture_mean 2.987e+00 1.928e+05 0 1
## perimeter_mean 1.319e+02 1.061e+06 0 1
## area_mean 4.039e+00 1.839e+04 0 1
## smoothness_mean 7.356e+03 4.287e+07 0 1
## compactness_mean -9.159e+03 2.615e+07 0 1
## concavity_mean 1.834e+03 1.785e+07 0 1
## concave_points_mean -1.539e+03 3.164e+07 0 1
## symmetry_mean 7.905e+02 1.660e+07 0 1
## fractal_dimension_mean 1.115e+04 8.366e+07 0 1
## radius_se 1.155e+03 9.914e+06 0 1
## texture_se 4.865e+01 8.029e+05 0 1
## perimeter_se -2.104e+02 9.496e+05 0 1
## area_se 3.459e+00 8.209e+04 0 1
## smoothness_se 1.814e+03 1.620e+08 0 1
## compactness_se -2.407e+03 8.335e+07 0 1
## concavity_se 8.846e+02 3.198e+07 0 1
## concave_points_se 1.716e+04 1.055e+08 0 1
## symmetry_se -1.721e+03 6.262e+07 0 1
## fractal_dimension_se -7.426e+04 3.508e+08 0 1
## radius_worst 1.218e+02 1.794e+06 0 1
## texture_worst 5.149e+00 1.216e+05 0 1
## perimeter_worst 2.060e+01 2.015e+05 0 1
## area_worst -1.779e+00 1.104e+04 0 1
## smoothness_worst -2.637e+03 4.993e+07 0 1
## compactness_worst -1.277e+01 7.379e+06 0 1
## concavity_worst 1.126e+02 4.991e+06 0 1
## concave_points_worst 4.235e+02 1.382e+07 0 1
## symmetry_worst 2.016e+02 8.365e+06 0 1
## fractal_dimension_worst 7.532e+03 7.368e+07 0 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5.2728e+02 on 398 degrees of freedom
## Residual deviance: 4.7503e-10 on 368 degrees of freedom
## AIC: 62
##
## Number of Fisher Scoring iterations: 31
# Profile-likelihood confidence intervals; the enormous ranges below confirm
# the full model's coefficients are not identifiable.
confint(logicFit0)
## Waiting for profiling to be done...
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 2.5 % 97.5 %
## (Intercept) -2.882463e+05 2.751460e+05
## radius_mean -2.519258e+05 2.396636e+05
## texture_mean -9.123054e+03 8.156416e+03
## perimeter_mean -3.401539e+04 3.532295e+04
## area_mean -7.458141e+02 7.538921e+02
## smoothness_mean -1.900029e+06 1.847477e+06
## compactness_mean -9.182101e+05 8.998927e+05
## concavity_mean -6.836607e+05 7.653721e+05
## concave_points_mean -1.062537e+06 1.094783e+06
## symmetry_mean -6.600467e+05 6.254670e+05
## fractal_dimension_mean -3.213638e+06 3.487890e+06
## radius_se -3.378776e+05 3.365212e+05
## texture_se -3.796298e+04 3.400918e+04
## perimeter_se -3.892483e+04 3.850404e+04
## area_se -3.148049e+03 3.513769e+03
## smoothness_se -6.720370e+06 6.516360e+06
## compactness_se -2.951952e+06 2.982483e+06
## concavity_se -1.226094e+06 1.137153e+06
## concave_points_se -4.561657e+06 4.370394e+06
## symmetry_se -3.072726e+06 3.986028e+06
## fractal_dimension_se -1.146201e+07 1.142799e+07
## radius_worst -7.132799e+04 7.625815e+04
## texture_worst -4.309823e+03 5.104662e+03
## perimeter_worst -7.561973e+03 8.042108e+03
## area_worst -4.928190e+02 4.719436e+02
## smoothness_worst -1.993220e+06 1.958862e+06
## compactness_worst -2.556637e+05 2.474008e+05
## concavity_worst -2.556836e+05 3.289934e+05
## concave_points_worst -5.629718e+05 5.638189e+05
## symmetry_worst -3.283951e+05 3.336707e+05
## fractal_dimension_worst -2.924861e+06 2.779466e+06
# Variance inflation factors for the full model: values in the hundreds to
# tens of thousands confirm severe multicollinearity among the predictors.
vif(logicFit0) -> logicFit0.vif
logicFit0.vif
## radius_mean texture_mean perimeter_mean
## 71311.06474 648.25482 85757.53964
## area_mean smoothness_mean compactness_mean
## 4418.11791 142.00378 600.35938
## concavity_mean concave_points_mean symmetry_mean
## 406.81464 188.95398 107.23957
## fractal_dimension_mean radius_se texture_se
## 124.78945 1246.50798 110.05390
## perimeter_se area_se smoothness_se
## 560.43448 715.88198 73.94513
## compactness_se concavity_se concave_points_se
## 684.15287 341.20532 154.49938
## symmetry_se fractal_dimension_se radius_worst
## 118.74676 305.78876 6091.82657
## texture_worst perimeter_worst area_worst
## 558.42181 3749.51367 2330.56145
## smoothness_worst compactness_worst concavity_worst
## 450.54476 614.30141 295.31954
## concave_points_worst symmetry_worst fractal_dimension_worst
## 125.00202 145.48540 897.94872
# Reduced logistic model on the pre-screened predictor set (per the summary
# below, train_set keeps only the texture/area/symmetry/smoothness/
# fractal-dimension features).
logicFit <- glm(diagnosis ~ . , data=train_set, family = binomial(link = "logit") , control = list(maxit = 50))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logicFit)
##
## Call:
## glm(formula = diagnosis ~ ., family = binomial(link = "logit"),
## data = train_set, control = list(maxit = 50))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5719 -0.1231 -0.0184 0.0044 3.8150
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.255e+01 6.641e+00 -6.408 1.47e-10 ***
## texture_mean 2.166e-01 8.887e-02 2.437 0.0148 *
## area_mean 2.295e-02 3.566e-03 6.435 1.23e-10 ***
## symmetry_mean 1.715e+01 2.107e+01 0.814 0.4156
## texture_se 2.060e+00 8.656e-01 2.380 0.0173 *
## smoothness_se 3.114e+02 2.232e+02 1.395 0.1629
## symmetry_se -3.106e+01 6.283e+01 -0.494 0.6211
## fractal_dimension_se -1.217e+03 5.350e+02 -2.275 0.0229 *
## smoothness_worst 4.779e+01 3.037e+01 1.573 0.1156
## symmetry_worst 1.372e+01 1.124e+01 1.220 0.2224
## fractal_dimension_worst 1.200e+02 6.041e+01 1.986 0.0470 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 527.285 on 398 degrees of freedom
## Residual deviance: 83.142 on 388 degrees of freedom
## AIC: 105.14
##
## Number of Fisher Scoring iterations: 9
# Profile-likelihood CIs for the reduced model -- now on a sensible scale.
confint(logicFit)
## Waiting for profiling to be done...
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 2.5 % 97.5 %
## (Intercept) -5.760536e+01 -31.21502128
## texture_mean 4.764176e-02 0.39944881
## area_mean 1.685838e-02 0.03101257
## symmetry_mean -2.396752e+01 59.53091596
## texture_se 3.657366e-01 3.81928267
## smoothness_se -1.153823e+02 758.74766276
## symmetry_se -1.495989e+02 103.97255277
## fractal_dimension_se -2.327408e+03 -225.88244460
## smoothness_worst -9.402002e+00 111.11062142
## symmetry_worst -8.290196e+00 36.38844131
## fractal_dimension_worst 6.261156e+00 245.01751277
# VIFs for the reduced model: a large improvement over the full model,
# though a few predictors (smoothness_se, fractal_dimension_se/_worst)
# remain above the common threshold of 10.
vif(logicFit) -> logicFit.vif
logicFit.vif
## texture_mean area_mean symmetry_mean
## 1.922587 3.466306 3.584410
## texture_se smoothness_se symmetry_se
## 3.091858 10.385765 3.763719
## fractal_dimension_se smoothness_worst symmetry_worst
## 12.655190 5.385274 7.286488
## fractal_dimension_worst
## 11.221048
# Sequential (Type I) analysis-of-deviance table: each term is tested in the
# order it enters the model, so p-values depend on term ordering.
print(anova(logicFit, test="Chisq"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: diagnosis
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 398 527.28
## texture_mean 1 49.652 397 477.63 1.836e-12 ***
## area_mean 1 283.476 396 194.16 < 2.2e-16 ***
## symmetry_mean 1 48.294 395 145.86 3.669e-12 ***
## texture_se 1 0.392 394 145.47 0.53131
## smoothness_se 1 0.080 393 145.39 0.77683
## symmetry_se 1 1.161 392 144.23 0.28120
## fractal_dimension_se 1 0.018 391 144.21 0.89436
## smoothness_worst 1 51.337 390 92.87 7.779e-13 ***
## symmetry_worst 1 5.439 389 87.44 0.01969 *
## fractal_dimension_worst 1 4.293 388 83.14 0.03826 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Diagnostic plots for the reduced model (mlrplots is a project helper
# defined elsewhere in this file/repository).
mlrplots(logicFit)
# Training-set accuracy at a 0.5 probability threshold.
# NOTE(review): the != comparison assumes diagnosis is coded 0/1 in
# train_set (the recorded ~0.95 accuracy is consistent with that); it would
# misbehave on a B/M factor -- confirm upstream coding.
predTrain <- predict(logicFit, train_set, type="response")
predTrain <- ifelse(predTrain>0.5,1,0)
error <- mean(predTrain != train_set$diagnosis)  # was `=`; `<-` for assignment
print(paste('Model Accuracy', 1-error))
## [1] "Model Accuracy 0.949874686716792"
library(ROCR)
# Training-set ROC curve.
p <- predict(logicFit, train_set, type="response")
pr <- prediction(p, train_set$diagnosis)
prf <- performance(pr, measure="tpr", x.measure="fpr")
plot(prf)
# Held-out test-set accuracy at the same threshold, plus its ROC curve.
predTest <- predict(logicFit, test_set, type="response")
predTest <- ifelse(predTest>0.5,1,0)
error <- mean(predTest != test_set$diagnosis)
print(paste('Test Accuracy', 1-error))
## [1] "Test Accuracy 0.964705882352941"
#library(ROCR)
p1 <- predict(logicFit, test_set, type="response")
pr1 <- prediction(p1, test_set$diagnosis)
prf1 <- performance(pr1, measure="tpr", x.measure="fpr")
plot(prf1)
# Data frame of all PCA scores plus the response; reuse the same partition
# indices so the PCA train/test rows match the raw-data split exactly.
BrCanPCAFull<-LoBrCanPCA$x
BrCanPCAFull<-data.frame(BrCanPCAFull)
BrCanPCAFull$diagnosis<-BrCanData$diagnosis
train_set_pca <- BrCanPCAFull[train_indx,]
test_set_pca <- BrCanPCAFull[-train_indx,]
# BUG FIX: the original read BrCanPCAFull$diagnosis.factor, a nonexistent
# column ($ silently returns NULL). Store the held-out test labels instead,
# mirroring the raw-data split above.
testValues_pca <- test_set_pca$diagnosis
nrow(train_set_pca)
## [1] 399
nrow(test_set_pca)
## [1] 170
# Logistic model on the principal-component scores. Residual deviance and
# AIC below match the reduced raw-feature model exactly, as expected for a
# full-rank linear transform of the same predictors.
logicFitPCA <- glm(diagnosis ~ . , data=train_set_pca, family = binomial(link = "logit") , control = list(maxit = 50))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logicFitPCA)
##
## Call:
## glm(formula = diagnosis ~ ., family = binomial(link = "logit"),
## data = train_set_pca, control = list(maxit = 50))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5719 -0.1231 -0.0184 0.0044 3.8150
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4192 0.3330 -1.259 0.208068
## PC1 1.6158 0.2775 5.823 5.77e-09 ***
## PC2 2.9015 0.5058 5.736 9.68e-09 ***
## PC3 5.5743 0.8857 6.294 3.10e-10 ***
## PC4 0.4313 0.2765 1.560 0.118877
## PC5 2.8127 0.7664 3.670 0.000243 ***
## PC6 -3.6740 0.6411 -5.731 1.00e-08 ***
## PC7 0.5873 0.4786 1.227 0.219723
## PC8 -2.5437 0.7424 -3.426 0.000612 ***
## PC9 -3.2176 1.4975 -2.149 0.031659 *
## PC10 -2.1542 1.3947 -1.545 0.122440
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 527.285 on 398 degrees of freedom
## Residual deviance: 83.142 on 388 degrees of freedom
## AIC: 105.14
##
## Number of Fisher Scoring iterations: 9
# Profile-likelihood CIs for the principal-component coefficients.
confint(logicFitPCA)
## Waiting for profiling to be done...
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 2.5 % 97.5 %
## (Intercept) -1.07733509 0.2441100
## PC1 1.13245192 2.2344907
## PC2 2.03560095 4.0370066
## PC3 4.06835566 7.5819244
## PC4 -0.09856318 1.0019745
## PC5 1.45161052 4.4647172
## PC6 -5.11323233 -2.5624424
## PC7 -0.34944422 1.5393649
## PC8 -4.15531891 -1.1991907
## PC9 -6.29987250 -0.4263754
## PC10 -5.01694635 0.5203752
# VIFs for the PCA-based model -- all below 10.
vif(logicFitPCA) -> logicFitPCA.vif
logicFitPCA.vif
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
## 2.574073 6.752508 9.034047 1.473490 3.817439 3.929230 1.347803 1.918338
## PC9 PC10
## 3.537856 2.224204
# Diagnostic plots for the PCA model (mlrplots is a project helper).
mlrplots(logicFitPCA)
# Train/test accuracy of the PCA model at a 0.5 threshold -- identical to
# the raw-feature model, as the recorded output confirms.
predTrainPCA <- predict(logicFitPCA, train_set_pca, type="response")
predTrainPCA <- ifelse(predTrainPCA>0.5,1,0)
error <- mean(predTrainPCA != train_set_pca$diagnosis)  # was `=`; `<-` for assignment
print(paste('PCA Model Accuracy', 1-error))
## [1] "PCA Model Accuracy 0.949874686716792"
predTestPCA <- predict(logicFitPCA, test_set_pca, type="response")
predTestPCA <- ifelse(predTestPCA>0.5,1,0)
error <- mean(predTestPCA != test_set_pca$diagnosis)
print(paste('PCA Test Accuracy', 1-error))
## [1] "PCA Test Accuracy 0.964705882352941"
library(glmnetUtils)
# Elastic-net (alpha = 0.5) penalized logistic regression on the reduced
# feature set; cross-validated on misclassification rate.
x <- model.matrix(diagnosis~.,train_set)[,-1]
#y <- BrCan$diagnosis
y <- as.factor(as.character(train_set$diagnosis))
# Fit models
#fit.lasso <- glmnet(x, y, family="gaussian", alpha=1) #fit LASSO
logicFitNet <- glmnet(x, y, family = "binomial", alpha=.5) #fit Elastic Net
# BUG FIX: removed the stray empty arguments (",,") from the original call.
cv <- cv.glmnet(x, y, type.measure = "class", alpha = 0.5, family = "binomial")
# BUG FIX: call the predict() generic, which dispatches to the cv.glmnet
# method (coefficients at lambda.1se by default), instead of invoking the
# S3 method predict.elnet() directly -- that method is for plain glmnet fits.
predict(cv, type = "coefficients")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -9.45477883
## texture_mean 0.06540105
## area_mean 0.00436924
## symmetry_mean 1.46251907
## texture_se .
## smoothness_se .
## symmetry_se .
## fractal_dimension_se .
## smoothness_worst 20.90320582
## symmetry_worst 4.79426102
## fractal_dimension_worst 3.72242618
plot(cv, main="Elastic Net")
#library(glmnetUtils)
# Repeat the elastic net on the full PCA-transformed data set
x <- model.matrix(diagnosis~.,BrCanPCAFull)[,-1]
#y <- BrCan$diagnosis
y <- as.factor(as.character(BrCanPCAFull$diagnosis))
# Fit models
#fit.lasso <- glmnet(x, y, family="gaussian", alpha=1) #fit LASSO
logicFitNet <- glmnet(x, y, family = "binomial", alpha=.5) #fit Elastic Net
# Stray empty arguments removed from the original cv.glmnet call
cv <- cv.glmnet(x, y, type.measure = "class", alpha = 0.5, family = "binomial")
plot(cv, main="Elastic Net using PCA transformed Data")
############ Setting up 10-fold cross-validation ####
ctrl <- trainControl("cv",number=10)
set.seed(1234)
# Logistic regression (glm) on the PCA training data. trControl = ctrl is
# now passed so the 10-fold CV defined above is actually used; previously
# ctrl was created but never supplied, so caret silently fell back to its
# bootstrap resampling default.
logit.ml <- train(diagnosis ~ ., data = train_set_pca, method = "glm", trControl = ctrl)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Resampled confusion matrix; caret reports cell values as percentages
# averaged over the held-out resamples (table sums to 100)
logit.cm <- confusionMatrix(logit.ml)
# Underlying glm fit summary (coefficients on the principal components)
summary(logit.ml)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5719 -0.1231 -0.0184 0.0044 3.8150
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4192 0.3330 -1.259 0.208068
## PC1 1.6158 0.2775 5.823 5.77e-09 ***
## PC2 2.9015 0.5058 5.736 9.68e-09 ***
## PC3 5.5743 0.8857 6.294 3.10e-10 ***
## PC4 0.4313 0.2765 1.560 0.118877
## PC5 2.8127 0.7664 3.670 0.000243 ***
## PC6 -3.6740 0.6411 -5.731 1.00e-08 ***
## PC7 0.5873 0.4786 1.227 0.219723
## PC8 -2.5437 0.7424 -3.426 0.000612 ***
## PC9 -3.2176 1.4975 -2.149 0.031659 *
## PC10 -2.1542 1.3947 -1.545 0.122440
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 527.285 on 398 degrees of freedom
## Residual deviance: 83.142 on 388 degrees of freedom
## AIC: 105.14
##
## Number of Fisher Scoring iterations: 9
cm_plot(logit.ml, "Logistic Regression") #### ---------------------># plot to use in report
# Report metrics from the resampled confusion matrix. Cells are percentages
# (table sums to 100), hence the /100 for accuracy. Layout:
# [2,2] true positives, [1,2] false negatives, [2,1] false positives.
logit.tp <- logit.cm$table[2, 2]
logit.fn <- logit.cm$table[1, 2]
logit.fp <- logit.cm$table[2, 1]
logit.metrics <- data.frame(
"Model" = "Logistic Regression",
"Accuracy" = (logit.cm$table[1, 1] + logit.tp) / 100,
"Recall" = logit.tp / (logit.tp + logit.fn),
"Precision" = logit.tp / (logit.fp + logit.tp),
"FNR" = logit.fn / (logit.tp + logit.fn),
"Fscore" = (2 * logit.tp) / (2 * logit.tp + logit.fn + logit.fp)
)
logit.metrics
## Model Accuracy Recall Precision FNR Fscore
## 1 Logistic Regression 0.9446433 0.9058026 0.9405321 0.09419744 0.9228407
# Random forest on the PCA data, resampled with the 10-fold CV control
rfPCA.ml <- train(diagnosis ~ ., data = train_set_pca, method = "rf", trControl = ctrl, importance = FALSE)
rfPCA.cm <- confusionMatrix(rfPCA.ml)
cm_plot(rfPCA.ml, "Random Forest with PCA")
#### ---------------------># plot to use in report
# Metrics from the percentage-scaled resampled confusion matrix:
# [2,2] true positives, [1,2] false negatives, [2,1] false positives.
rfPCA.tp <- rfPCA.cm$table[2, 2]
rfPCA.fn <- rfPCA.cm$table[1, 2]
rfPCA.fp <- rfPCA.cm$table[2, 1]
rfPCA.metrics <- data.frame(
"Model" = "Random Forest with PCA",
"Accuracy" = (rfPCA.cm$table[1, 1] + rfPCA.tp) / 100,
"Recall" = rfPCA.tp / (rfPCA.tp + rfPCA.fn),
"Precision" = rfPCA.tp / (rfPCA.fp + rfPCA.tp),
"FNR" = rfPCA.fn / (rfPCA.tp + rfPCA.fn),
"Fscore" = (2 * rfPCA.tp) / (2 * rfPCA.tp + rfPCA.fn + rfPCA.fp)
)
rfPCA.metrics
## Model Accuracy Recall Precision FNR Fscore
## 1 Random Forest with PCA 0.9172932 0.8926174 0.8866667 0.1073826 0.8896321
plot(varImp(rfPCA.ml), top = 10, main = "Random forest with PCA")
# Random forest on the untransformed data, same 10-fold CV control
rf.ml <- train(diagnosis ~ ., data = train_set, method = "rf", trControl = ctrl, importance = FALSE)
rf.cm <- confusionMatrix(rf.ml)
cm_plot(rf.ml, "Random Forest without PCA") #### ---------------------># plot to use in report
# Metrics from the percentage-scaled resampled confusion matrix:
# [2,2] true positives, [1,2] false negatives, [2,1] false positives.
rf.tp <- rf.cm$table[2, 2]
rf.fn <- rf.cm$table[1, 2]
rf.fp <- rf.cm$table[2, 1]
rf.metrics <- data.frame(
"Model" = "Random Forest",
"Accuracy" = (rf.cm$table[1, 1] + rf.tp) / 100,
"Recall" = rf.tp / (rf.tp + rf.fn),
"Precision" = rf.tp / (rf.fp + rf.tp),
"FNR" = rf.fn / (rf.tp + rf.fn),
"Fscore" = (2 * rf.tp) / (2 * rf.tp + rf.fn + rf.fp)
)
rf.metrics
## Model Accuracy Recall Precision FNR Fscore
## 1 Random Forest 0.962406 0.9261745 0.971831 0.0738255 0.9484536
plot(varImp(rf.ml), top = 10, main = "Random forest without PCA")
library(dplyr)
library(ggraph)
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:formattable':
##
## normalize
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# Visualize one tree from a random forest ensemble as a dendrogram.
# Args:
#   final_model: a fitted randomForest object (as returned by randomForest()).
#   tree_num: index of the tree to extract (1..ntree).
# Side effect: prints a ggraph plot of the requested tree; internal nodes
# show split variable and split point, leaves show the predicted class.
tree_func <- function(final_model,
tree_num) {
# get tree by index
tree <- randomForest::getTree(final_model,
k = tree_num,
labelVar = TRUE) %>%
tibble::rownames_to_column() %>%
# make leaf split points to NA, so the 0s won't get plotted
# (internal nodes have NA prediction, so their split points are kept;
# leaf rows have a prediction, so their split points become NA)
mutate(`split point` = ifelse(is.na(prediction), `split point`, NA))
# prepare data frame for graph
# each node appears twice as an edge source: once per daughter
graph_frame <- data.frame(from = rep(tree$rowname, 2),
to = c(tree$`left daughter`, tree$`right daughter`))
# convert to graph and delete the last node that we don't want to plot
# (leaf rows list daughter "0", which is not a real node)
graph <- graph_from_data_frame(graph_frame) %>%
delete_vertices("0")
# set node labels (underscores replaced for readability in the plot)
V(graph)$node_label <- gsub("_", " ", as.character(tree$`split var`))
V(graph)$leaf_label <- as.character(tree$prediction)
V(graph)$split <- as.character(round(tree$`split point`, digits = 2))
# plot: dendrogram layout with variable names, split values below nodes,
# and colour-filled class labels on leaves; all axes/grid stripped
plot <- ggraph(graph, 'dendrogram') +
theme_bw() +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = node_label), na.rm = TRUE, repel = TRUE) +
geom_node_label(aes(label = split), vjust = 2.5, na.rm = TRUE, fill = "white") +
geom_node_label(aes(label = leaf_label, fill = leaf_label), na.rm = TRUE,
repel = TRUE, colour = "white", fontface = "bold", show.legend = FALSE) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank(),
plot.background = element_rect(fill = "white"),
panel.border = element_blank(),
axis.line = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
plot.title = element_text(size = 18))
print(plot)
}
library(party)
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:igraph':
##
## clusters
## The following object is masked from 'package:car':
##
## Predict
## The following object is masked from 'package:kernlab':
##
## prior
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
# Fit a small random forest (50 trees, mtry = 2) on the untransformed
# training data so a single tree can be visualized with tree_func()
model <- randomForest(diagnosis ~ ., data = train_set, importance = TRUE, ntree = 50, mtry = 2, do.trace = 100)
tree_num <- 10  # `<-` assignment, no trailing semicolon
tree_func(model, tree_num)
## Warning: Duplicated aesthetics after name standardisation: na.rm
## Warning: Duplicated aesthetics after name standardisation: na.rm
## Warning: Duplicated aesthetics after name standardisation: na.rm
## Warning: Removed 36 rows containing missing values (geom_text_repel).
## Warning: Removed 36 rows containing missing values (geom_label).
## Warning: Removed 35 rows containing missing values (geom_label_repel).
#plot(x,"simple")
# k-nearest neighbours on the PCA data, tuned with the 10-fold CV control
knn.ml <- train(diagnosis ~ ., data = train_set_pca, method = "knn", trControl = ctrl)
knn.cm <- confusionMatrix(knn.ml)
cm_plot(knn.ml, "KNN Model") #### ---------------------># plot to use in report
# Metrics from the percentage-scaled resampled confusion matrix:
# [2,2] true positives, [1,2] false negatives, [2,1] false positives.
knn.tp <- knn.cm$table[2, 2]
knn.fn <- knn.cm$table[1, 2]
knn.fp <- knn.cm$table[2, 1]
knn.metrics <- data.frame(
"Model" = "KNN",
"Accuracy" = (knn.cm$table[1, 1] + knn.tp) / 100,
"Recall" = knn.tp / (knn.tp + knn.fn),
"Precision" = knn.tp / (knn.fp + knn.tp),
"FNR" = knn.fn / (knn.tp + knn.fn),
"Fscore" = (2 * knn.tp) / (2 * knn.tp + knn.fn + knn.fp)
)
knn.metrics
## Model Accuracy Recall Precision FNR Fscore
## 1 KNN 0.9223058 0.8389262 0.9469697 0.1610738 0.8896797
library(gbm)
# Gradient boosting on the PCA data with 10-fold CV. verbose = FALSE is
# forwarded through caret's "..." to gbm() and suppresses the hundreds of
# lines of per-iteration deviance output that otherwise flood the report.
gbm.ml <- train(diagnosis ~ ., data = train_set_pca, method = "gbm", trControl = ctrl, verbose = FALSE)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2750 nan 0.1000 0.0159
## 2 1.2219 nan 0.1000 0.0240
## 3 1.1884 nan 0.1000 0.0153
## 4 1.1495 nan 0.1000 0.0183
## 5 1.1209 nan 0.1000 0.0128
## 6 1.0959 nan 0.1000 0.0094
## 7 1.0515 nan 0.1000 0.0131
## 8 1.0218 nan 0.1000 0.0122
## 9 0.9961 nan 0.1000 0.0114
## 10 0.9725 nan 0.1000 0.0056
## 20 0.8005 nan 0.1000 0.0034
## 40 0.6020 nan 0.1000 0.0015
## 60 0.4935 nan 0.1000 0.0011
## 80 0.4121 nan 0.1000 0.0007
## 100 0.3559 nan 0.1000 -0.0013
## 120 0.3092 nan 0.1000 0.0000
## 140 0.2730 nan 0.1000 0.0002
## 150 0.2553 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2410 nan 0.1000 0.0357
## 2 1.1530 nan 0.1000 0.0366
## 3 1.1026 nan 0.1000 0.0232
## 4 1.0520 nan 0.1000 0.0226
## 5 0.9987 nan 0.1000 0.0195
## 6 0.9598 nan 0.1000 0.0150
## 7 0.9202 nan 0.1000 0.0133
## 8 0.8817 nan 0.1000 0.0189
## 9 0.8471 nan 0.1000 0.0147
## 10 0.8162 nan 0.1000 0.0126
## 20 0.5947 nan 0.1000 0.0050
## 40 0.3765 nan 0.1000 0.0029
## 60 0.2679 nan 0.1000 0.0004
## 80 0.2078 nan 0.1000 -0.0002
## 100 0.1611 nan 0.1000 -0.0000
## 120 0.1262 nan 0.1000 -0.0002
## 140 0.1015 nan 0.1000 -0.0004
## 150 0.0913 nan 0.1000 -0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2119 nan 0.1000 0.0542
## 2 1.1289 nan 0.1000 0.0389
## 3 1.0520 nan 0.1000 0.0346
## 4 0.9910 nan 0.1000 0.0252
## 5 0.9204 nan 0.1000 0.0285
## 6 0.8636 nan 0.1000 0.0226
## 7 0.8106 nan 0.1000 0.0201
## 8 0.7710 nan 0.1000 0.0171
## 9 0.7405 nan 0.1000 0.0109
## 10 0.7018 nan 0.1000 0.0166
## 20 0.4733 nan 0.1000 0.0040
## 40 0.2712 nan 0.1000 0.0014
## 60 0.1779 nan 0.1000 -0.0001
## 80 0.1243 nan 0.1000 -0.0005
## 100 0.0928 nan 0.1000 0.0001
## 120 0.0686 nan 0.1000 -0.0006
## 140 0.0519 nan 0.1000 -0.0001
## 150 0.0458 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2751 nan 0.1000 0.0233
## 2 1.2307 nan 0.1000 0.0194
## 3 1.1914 nan 0.1000 0.0122
## 4 1.1602 nan 0.1000 0.0140
## 5 1.1279 nan 0.1000 0.0147
## 6 1.1014 nan 0.1000 0.0108
## 7 1.0778 nan 0.1000 0.0069
## 8 1.0530 nan 0.1000 0.0088
## 9 1.0255 nan 0.1000 0.0127
## 10 1.0054 nan 0.1000 0.0060
## 20 0.8305 nan 0.1000 0.0057
## 40 0.6363 nan 0.1000 0.0015
## 60 0.5254 nan 0.1000 0.0002
## 80 0.4486 nan 0.1000 0.0004
## 100 0.3884 nan 0.1000 0.0008
## 120 0.3395 nan 0.1000 0.0002
## 140 0.3047 nan 0.1000 -0.0005
## 150 0.2891 nan 0.1000 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2481 nan 0.1000 0.0293
## 2 1.1834 nan 0.1000 0.0293
## 3 1.1176 nan 0.1000 0.0260
## 4 1.0648 nan 0.1000 0.0207
## 5 1.0129 nan 0.1000 0.0205
## 6 0.9755 nan 0.1000 0.0148
## 7 0.9322 nan 0.1000 0.0143
## 8 0.8985 nan 0.1000 0.0136
## 9 0.8686 nan 0.1000 0.0115
## 10 0.8405 nan 0.1000 0.0100
## 20 0.6317 nan 0.1000 0.0062
## 40 0.4271 nan 0.1000 0.0006
## 60 0.3147 nan 0.1000 0.0008
## 80 0.2462 nan 0.1000 -0.0002
## 100 0.1947 nan 0.1000 -0.0006
## 120 0.1591 nan 0.1000 0.0000
## 140 0.1328 nan 0.1000 -0.0005
## 150 0.1197 nan 0.1000 -0.0007
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2073 nan 0.1000 0.0476
## 2 1.1208 nan 0.1000 0.0407
## 3 1.0495 nan 0.1000 0.0274
## 4 0.9871 nan 0.1000 0.0258
## 5 0.9299 nan 0.1000 0.0265
## 6 0.8824 nan 0.1000 0.0183
## 7 0.8453 nan 0.1000 0.0146
## 8 0.8055 nan 0.1000 0.0161
## 9 0.7721 nan 0.1000 0.0152
## 10 0.7382 nan 0.1000 0.0137
## 20 0.5052 nan 0.1000 0.0057
## 40 0.2989 nan 0.1000 -0.0001
## 60 0.2047 nan 0.1000 -0.0008
## 80 0.1480 nan 0.1000 -0.0012
## 100 0.1090 nan 0.1000 -0.0002
## 120 0.0842 nan 0.1000 -0.0005
## 140 0.0643 nan 0.1000 0.0002
## 150 0.0574 nan 0.1000 -0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2622 nan 0.1000 0.0278
## 2 1.2141 nan 0.1000 0.0202
## 3 1.1774 nan 0.1000 0.0159
## 4 1.1480 nan 0.1000 0.0113
## 5 1.1113 nan 0.1000 0.0153
## 6 1.0871 nan 0.1000 0.0104
## 7 1.0583 nan 0.1000 0.0136
## 8 1.0342 nan 0.1000 0.0107
## 9 1.0112 nan 0.1000 0.0085
## 10 0.9864 nan 0.1000 0.0090
## 20 0.8262 nan 0.1000 0.0035
## 40 0.6359 nan 0.1000 0.0015
## 60 0.5159 nan 0.1000 0.0008
## 80 0.4291 nan 0.1000 0.0017
## 100 0.3668 nan 0.1000 0.0008
## 120 0.3242 nan 0.1000 -0.0004
## 140 0.2792 nan 0.1000 0.0000
## 150 0.2627 nan 0.1000 -0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2360 nan 0.1000 0.0374
## 2 1.1648 nan 0.1000 0.0321
## 3 1.0984 nan 0.1000 0.0293
## 4 1.0507 nan 0.1000 0.0184
## 5 1.0046 nan 0.1000 0.0199
## 6 0.9531 nan 0.1000 0.0229
## 7 0.9129 nan 0.1000 0.0152
## 8 0.8703 nan 0.1000 0.0169
## 9 0.8407 nan 0.1000 0.0119
## 10 0.8157 nan 0.1000 0.0096
## 20 0.6062 nan 0.1000 0.0055
## 40 0.3927 nan 0.1000 0.0002
## 60 0.2877 nan 0.1000 0.0005
## 80 0.2207 nan 0.1000 -0.0004
## 100 0.1749 nan 0.1000 -0.0001
## 120 0.1409 nan 0.1000 -0.0007
## 140 0.1177 nan 0.1000 0.0001
## 150 0.1079 nan 0.1000 -0.0007
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2090 nan 0.1000 0.0514
## 2 1.1211 nan 0.1000 0.0394
## 3 1.0449 nan 0.1000 0.0351
## 4 0.9823 nan 0.1000 0.0282
## 5 0.9200 nan 0.1000 0.0249
## 6 0.8735 nan 0.1000 0.0216
## 7 0.8339 nan 0.1000 0.0159
## 8 0.7958 nan 0.1000 0.0157
## 9 0.7578 nan 0.1000 0.0130
## 10 0.7254 nan 0.1000 0.0142
## 20 0.4927 nan 0.1000 0.0046
## 40 0.2997 nan 0.1000 0.0004
## 60 0.2081 nan 0.1000 -0.0009
## 80 0.1524 nan 0.1000 0.0000
## 100 0.1144 nan 0.1000 -0.0003
## 120 0.0876 nan 0.1000 -0.0007
## 140 0.0682 nan 0.1000 -0.0004
## 150 0.0603 nan 0.1000 -0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2722 nan 0.1000 0.0251
## 2 1.2280 nan 0.1000 0.0197
## 3 1.1907 nan 0.1000 0.0165
## 4 1.1622 nan 0.1000 0.0113
## 5 1.1290 nan 0.1000 0.0146
## 6 1.1010 nan 0.1000 0.0140
## 7 1.0754 nan 0.1000 0.0089
## 8 1.0470 nan 0.1000 0.0124
## 9 1.0234 nan 0.1000 0.0084
## 10 0.9998 nan 0.1000 0.0067
## 20 0.8348 nan 0.1000 0.0023
## 40 0.6374 nan 0.1000 0.0003
## 60 0.5225 nan 0.1000 -0.0004
## 80 0.4385 nan 0.1000 0.0001
## 100 0.3805 nan 0.1000 0.0004
## 120 0.3377 nan 0.1000 -0.0006
## 140 0.2983 nan 0.1000 0.0002
## 150 0.2834 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2358 nan 0.1000 0.0399
## 2 1.1636 nan 0.1000 0.0328
## 3 1.1062 nan 0.1000 0.0280
## 4 1.0626 nan 0.1000 0.0166
## 5 1.0067 nan 0.1000 0.0216
## 6 0.9659 nan 0.1000 0.0176
## 7 0.9291 nan 0.1000 0.0163
## 8 0.8947 nan 0.1000 0.0121
## 9 0.8658 nan 0.1000 0.0103
## 10 0.8386 nan 0.1000 0.0091
## 20 0.6205 nan 0.1000 0.0029
## 40 0.4102 nan 0.1000 0.0017
## 60 0.2993 nan 0.1000 0.0007
## 80 0.2296 nan 0.1000 -0.0019
## 100 0.1755 nan 0.1000 -0.0007
## 120 0.1454 nan 0.1000 -0.0003
## 140 0.1179 nan 0.1000 -0.0002
## 150 0.1074 nan 0.1000 -0.0005
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2210 nan 0.1000 0.0498
## 2 1.1242 nan 0.1000 0.0456
## 3 1.0527 nan 0.1000 0.0273
## 4 0.9907 nan 0.1000 0.0294
## 5 0.9378 nan 0.1000 0.0237
## 6 0.8823 nan 0.1000 0.0246
## 7 0.8434 nan 0.1000 0.0156
## 8 0.8051 nan 0.1000 0.0136
## 9 0.7685 nan 0.1000 0.0103
## 10 0.7376 nan 0.1000 0.0116
## 20 0.5094 nan 0.1000 0.0079
## 40 0.3080 nan 0.1000 -0.0011
## 60 0.2055 nan 0.1000 -0.0001
## 80 0.1518 nan 0.1000 0.0002
## 100 0.1140 nan 0.1000 -0.0002
## 120 0.0884 nan 0.1000 -0.0002
## 140 0.0693 nan 0.1000 -0.0003
## 150 0.0625 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2680 nan 0.1000 0.0270
## 2 1.2274 nan 0.1000 0.0160
## 3 1.1831 nan 0.1000 0.0214
## 4 1.1471 nan 0.1000 0.0172
## 5 1.1108 nan 0.1000 0.0138
## 6 1.0824 nan 0.1000 0.0108
## 7 1.0502 nan 0.1000 0.0118
## 8 1.0239 nan 0.1000 0.0106
## 9 1.0010 nan 0.1000 0.0092
## 10 0.9780 nan 0.1000 0.0079
## 20 0.8122 nan 0.1000 0.0060
## 40 0.6086 nan 0.1000 0.0003
## 60 0.4903 nan 0.1000 -0.0000
## 80 0.4118 nan 0.1000 0.0002
## 100 0.3544 nan 0.1000 0.0003
## 120 0.3098 nan 0.1000 -0.0002
## 140 0.2723 nan 0.1000 -0.0002
## 150 0.2597 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2382 nan 0.1000 0.0332
## 2 1.1595 nan 0.1000 0.0361
## 3 1.0973 nan 0.1000 0.0298
## 4 1.0454 nan 0.1000 0.0236
## 5 0.9975 nan 0.1000 0.0153
## 6 0.9574 nan 0.1000 0.0137
## 7 0.9205 nan 0.1000 0.0120
## 8 0.8861 nan 0.1000 0.0115
## 9 0.8570 nan 0.1000 0.0088
## 10 0.8225 nan 0.1000 0.0119
## 20 0.6018 nan 0.1000 0.0051
## 40 0.3946 nan 0.1000 0.0012
## 60 0.2873 nan 0.1000 0.0007
## 80 0.2233 nan 0.1000 -0.0006
## 100 0.1735 nan 0.1000 -0.0002
## 120 0.1402 nan 0.1000 -0.0003
## 140 0.1170 nan 0.1000 -0.0007
## 150 0.1082 nan 0.1000 -0.0005
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2149 nan 0.1000 0.0430
## 2 1.1223 nan 0.1000 0.0414
## 3 1.0595 nan 0.1000 0.0218
## 4 0.9878 nan 0.1000 0.0301
## 5 0.9321 nan 0.1000 0.0264
## 6 0.8845 nan 0.1000 0.0153
## 7 0.8326 nan 0.1000 0.0236
## 8 0.7959 nan 0.1000 0.0141
## 9 0.7628 nan 0.1000 0.0129
## 10 0.7248 nan 0.1000 0.0132
## 20 0.4995 nan 0.1000 0.0053
## 40 0.2917 nan 0.1000 -0.0004
## 60 0.1992 nan 0.1000 0.0008
## 80 0.1442 nan 0.1000 0.0001
## 100 0.1100 nan 0.1000 -0.0004
## 120 0.0844 nan 0.1000 -0.0001
## 140 0.0679 nan 0.1000 -0.0006
## 150 0.0602 nan 0.1000 -0.0006
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2626 nan 0.1000 0.0274
## 2 1.2122 nan 0.1000 0.0208
## 3 1.1720 nan 0.1000 0.0180
## 4 1.1416 nan 0.1000 0.0148
## 5 1.1074 nan 0.1000 0.0139
## 6 1.0785 nan 0.1000 0.0100
## 7 1.0480 nan 0.1000 0.0148
## 8 1.0246 nan 0.1000 0.0085
## 9 1.0046 nan 0.1000 0.0067
## 10 0.9839 nan 0.1000 0.0093
## 20 0.8212 nan 0.1000 0.0032
## 40 0.6298 nan 0.1000 0.0017
## 60 0.5087 nan 0.1000 0.0013
## 80 0.4297 nan 0.1000 0.0006
## 100 0.3720 nan 0.1000 0.0014
## 120 0.3246 nan 0.1000 -0.0007
## 140 0.2911 nan 0.1000 -0.0000
## 150 0.2762 nan 0.1000 -0.0017
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2381 nan 0.1000 0.0388
## 2 1.1650 nan 0.1000 0.0307
## 3 1.1124 nan 0.1000 0.0181
## 4 1.0644 nan 0.1000 0.0213
## 5 1.0057 nan 0.1000 0.0233
## 6 0.9516 nan 0.1000 0.0212
## 7 0.9088 nan 0.1000 0.0195
## 8 0.8748 nan 0.1000 0.0099
## 9 0.8446 nan 0.1000 0.0097
## 10 0.8150 nan 0.1000 0.0099
## 20 0.5983 nan 0.1000 0.0067
## 40 0.3946 nan 0.1000 -0.0002
## 60 0.2831 nan 0.1000 0.0004
## 80 0.2133 nan 0.1000 -0.0001
## 100 0.1740 nan 0.1000 -0.0007
## 120 0.1382 nan 0.1000 -0.0002
## 140 0.1145 nan 0.1000 -0.0008
## 150 0.1040 nan 0.1000 -0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2117 nan 0.1000 0.0493
## 2 1.1233 nan 0.1000 0.0325
## 3 1.0427 nan 0.1000 0.0361
## 4 0.9912 nan 0.1000 0.0183
## 5 0.9287 nan 0.1000 0.0268
## 6 0.8903 nan 0.1000 0.0129
## 7 0.8435 nan 0.1000 0.0154
## 8 0.8111 nan 0.1000 0.0116
## 9 0.7724 nan 0.1000 0.0126
## 10 0.7310 nan 0.1000 0.0160
## 20 0.5142 nan 0.1000 0.0042
## 40 0.3034 nan 0.1000 0.0008
## 60 0.2030 nan 0.1000 0.0005
## 80 0.1442 nan 0.1000 -0.0001
## 100 0.1063 nan 0.1000 -0.0006
## 120 0.0797 nan 0.1000 -0.0000
## 140 0.0617 nan 0.1000 -0.0003
## 150 0.0543 nan 0.1000 -0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2681 nan 0.1000 0.0277
## 2 1.2257 nan 0.1000 0.0220
## 3 1.1896 nan 0.1000 0.0181
## 4 1.1554 nan 0.1000 0.0143
## 5 1.1301 nan 0.1000 0.0090
## 6 1.0934 nan 0.1000 0.0198
## 7 1.0645 nan 0.1000 0.0094
## 8 1.0401 nan 0.1000 0.0092
## 9 1.0133 nan 0.1000 0.0103
## 10 0.9888 nan 0.1000 0.0117
## 20 0.8139 nan 0.1000 0.0052
## 40 0.6210 nan 0.1000 0.0012
## 60 0.5047 nan 0.1000 0.0025
## 80 0.4242 nan 0.1000 -0.0003
## 100 0.3645 nan 0.1000 -0.0001
## 120 0.3213 nan 0.1000 -0.0003
## 140 0.2868 nan 0.1000 -0.0001
## 150 0.2717 nan 0.1000 0.0009
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2346 nan 0.1000 0.0402
## 2 1.1655 nan 0.1000 0.0312
## 3 1.1047 nan 0.1000 0.0270
## 4 1.0543 nan 0.1000 0.0231
## 5 1.0095 nan 0.1000 0.0168
## 6 0.9591 nan 0.1000 0.0199
## 7 0.9127 nan 0.1000 0.0197
## 8 0.8789 nan 0.1000 0.0138
## 9 0.8494 nan 0.1000 0.0115
## 10 0.8110 nan 0.1000 0.0158
## 20 0.5935 nan 0.1000 0.0071
## 40 0.3895 nan 0.1000 0.0026
## 60 0.2917 nan 0.1000 -0.0007
## 80 0.2227 nan 0.1000 -0.0000
## 100 0.1789 nan 0.1000 -0.0008
## 120 0.1440 nan 0.1000 -0.0009
## 140 0.1200 nan 0.1000 -0.0007
## 150 0.1110 nan 0.1000 -0.0007
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.1982 nan 0.1000 0.0547
## 2 1.1082 nan 0.1000 0.0450
## 3 1.0352 nan 0.1000 0.0332
## 4 0.9693 nan 0.1000 0.0270
## 5 0.9096 nan 0.1000 0.0279
## 6 0.8681 nan 0.1000 0.0158
## 7 0.8227 nan 0.1000 0.0201
## 8 0.7803 nan 0.1000 0.0195
## 9 0.7444 nan 0.1000 0.0124
## 10 0.7084 nan 0.1000 0.0132
## 20 0.4870 nan 0.1000 0.0040
## 40 0.2912 nan 0.1000 0.0009
## 60 0.1961 nan 0.1000 0.0000
## 80 0.1447 nan 0.1000 -0.0003
## 100 0.1092 nan 0.1000 0.0001
## 120 0.0816 nan 0.1000 -0.0010
## 140 0.0637 nan 0.1000 -0.0006
## 150 0.0560 nan 0.1000 -0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2793 nan 0.1000 0.0199
## 2 1.2312 nan 0.1000 0.0231
## 3 1.2040 nan 0.1000 0.0120
## 4 1.1734 nan 0.1000 0.0110
## 5 1.1369 nan 0.1000 0.0166
## 6 1.1033 nan 0.1000 0.0169
## 7 1.0722 nan 0.1000 0.0115
## 8 1.0460 nan 0.1000 0.0086
## 9 1.0233 nan 0.1000 0.0099
## 10 1.0044 nan 0.1000 0.0069
## 20 0.8242 nan 0.1000 0.0061
## 40 0.6182 nan 0.1000 0.0024
## 60 0.5095 nan 0.1000 0.0017
## 80 0.4293 nan 0.1000 0.0005
## 100 0.3670 nan 0.1000 0.0002
## 120 0.3251 nan 0.1000 -0.0001
## 140 0.2831 nan 0.1000 0.0008
## 150 0.2654 nan 0.1000 -0.0006
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2700 nan 0.1000 0.0203
## 2 1.1876 nan 0.1000 0.0369
## 3 1.1232 nan 0.1000 0.0304
## 4 1.0676 nan 0.1000 0.0247
## 5 1.0131 nan 0.1000 0.0261
## 6 0.9725 nan 0.1000 0.0153
## 7 0.9386 nan 0.1000 0.0125
## 8 0.8947 nan 0.1000 0.0203
## 9 0.8655 nan 0.1000 0.0129
## 10 0.8346 nan 0.1000 0.0122
## 20 0.6151 nan 0.1000 0.0076
## 40 0.3956 nan 0.1000 0.0032
## 60 0.2883 nan 0.1000 -0.0006
## 80 0.2239 nan 0.1000 -0.0016
## 100 0.1732 nan 0.1000 -0.0003
## 120 0.1385 nan 0.1000 -0.0002
## 140 0.1122 nan 0.1000 0.0001
## 150 0.1043 nan 0.1000 -0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2175 nan 0.1000 0.0524
## 2 1.1258 nan 0.1000 0.0407
## 3 1.0568 nan 0.1000 0.0255
## 4 0.9894 nan 0.1000 0.0274
## 5 0.9413 nan 0.1000 0.0211
## 6 0.8886 nan 0.1000 0.0232
## 7 0.8464 nan 0.1000 0.0180
## 8 0.8098 nan 0.1000 0.0091
## 9 0.7671 nan 0.1000 0.0142
## 10 0.7319 nan 0.1000 0.0125
## 20 0.4906 nan 0.1000 0.0052
## 40 0.2873 nan 0.1000 0.0011
## 60 0.1951 nan 0.1000 0.0005
## 80 0.1416 nan 0.1000 -0.0001
## 100 0.1051 nan 0.1000 -0.0003
## 120 0.0788 nan 0.1000 -0.0002
## 140 0.0601 nan 0.1000 -0.0003
## 150 0.0542 nan 0.1000 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2683 nan 0.1000 0.0236
## 2 1.2291 nan 0.1000 0.0202
## 3 1.1898 nan 0.1000 0.0171
## 4 1.1566 nan 0.1000 0.0126
## 5 1.1289 nan 0.1000 0.0107
## 6 1.0930 nan 0.1000 0.0129
## 7 1.0666 nan 0.1000 0.0098
## 8 1.0400 nan 0.1000 0.0101
## 9 1.0172 nan 0.1000 0.0109
## 10 0.9959 nan 0.1000 0.0064
## 20 0.8195 nan 0.1000 0.0026
## 40 0.6257 nan 0.1000 0.0018
## 60 0.5156 nan 0.1000 0.0005
## 80 0.4410 nan 0.1000 -0.0003
## 100 0.3767 nan 0.1000 0.0012
## 120 0.3277 nan 0.1000 0.0018
## 140 0.2906 nan 0.1000 -0.0001
## 150 0.2744 nan 0.1000 -0.0006
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2354 nan 0.1000 0.0355
## 2 1.1748 nan 0.1000 0.0216
## 3 1.1260 nan 0.1000 0.0193
## 4 1.0656 nan 0.1000 0.0280
## 5 1.0110 nan 0.1000 0.0217
## 6 0.9645 nan 0.1000 0.0165
## 7 0.9282 nan 0.1000 0.0094
## 8 0.8930 nan 0.1000 0.0132
## 9 0.8542 nan 0.1000 0.0172
## 10 0.8183 nan 0.1000 0.0138
## 20 0.6136 nan 0.1000 0.0054
## 40 0.4019 nan 0.1000 0.0021
## 60 0.3002 nan 0.1000 0.0011
## 80 0.2364 nan 0.1000 -0.0009
## 100 0.1901 nan 0.1000 -0.0009
## 120 0.1590 nan 0.1000 -0.0010
## 140 0.1294 nan 0.1000 -0.0006
## 150 0.1207 nan 0.1000 -0.0014
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2144 nan 0.1000 0.0459
## 2 1.1283 nan 0.1000 0.0365
## 3 1.0609 nan 0.1000 0.0265
## 4 0.9931 nan 0.1000 0.0306
## 5 0.9326 nan 0.1000 0.0284
## 6 0.8811 nan 0.1000 0.0195
## 7 0.8418 nan 0.1000 0.0160
## 8 0.7946 nan 0.1000 0.0185
## 9 0.7627 nan 0.1000 0.0102
## 10 0.7352 nan 0.1000 0.0085
## 20 0.5050 nan 0.1000 0.0045
## 40 0.3029 nan 0.1000 0.0009
## 60 0.2060 nan 0.1000 0.0001
## 80 0.1485 nan 0.1000 -0.0001
## 100 0.1139 nan 0.1000 -0.0003
## 120 0.0905 nan 0.1000 0.0002
## 140 0.0726 nan 0.1000 -0.0003
## 150 0.0635 nan 0.1000 0.0001
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2713 nan 0.1000 0.0242
## 2 1.2267 nan 0.1000 0.0153
## 3 1.1874 nan 0.1000 0.0207
## 4 1.1535 nan 0.1000 0.0158
## 5 1.1295 nan 0.1000 0.0077
## 6 1.1077 nan 0.1000 0.0081
## 7 1.0769 nan 0.1000 0.0090
## 8 1.0444 nan 0.1000 0.0157
## 9 1.0193 nan 0.1000 0.0118
## 10 0.9990 nan 0.1000 0.0063
## 20 0.8281 nan 0.1000 0.0021
## 40 0.6347 nan 0.1000 0.0017
## 60 0.5199 nan 0.1000 0.0004
## 80 0.4398 nan 0.1000 0.0005
## 100 0.3820 nan 0.1000 0.0012
## 120 0.3339 nan 0.1000 0.0006
## 140 0.2996 nan 0.1000 -0.0008
## 150 0.2846 nan 0.1000 -0.0005
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2267 nan 0.1000 0.0417
## 2 1.1542 nan 0.1000 0.0309
## 3 1.0947 nan 0.1000 0.0262
## 4 1.0480 nan 0.1000 0.0189
## 5 1.0000 nan 0.1000 0.0242
## 6 0.9543 nan 0.1000 0.0172
## 7 0.9229 nan 0.1000 0.0133
## 8 0.8856 nan 0.1000 0.0150
## 9 0.8506 nan 0.1000 0.0132
## 10 0.8243 nan 0.1000 0.0119
## 20 0.6074 nan 0.1000 0.0068
## 40 0.3989 nan 0.1000 0.0019
## 60 0.2936 nan 0.1000 0.0006
## 80 0.2271 nan 0.1000 0.0002
## 100 0.1812 nan 0.1000 0.0001
## 120 0.1477 nan 0.1000 -0.0002
## 140 0.1243 nan 0.1000 -0.0003
## 150 0.1130 nan 0.1000 -0.0005
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2174 nan 0.1000 0.0372
## 2 1.1240 nan 0.1000 0.0426
## 3 1.0522 nan 0.1000 0.0325
## 4 0.9783 nan 0.1000 0.0365
## 5 0.9146 nan 0.1000 0.0285
## 6 0.8780 nan 0.1000 0.0130
## 7 0.8387 nan 0.1000 0.0147
## 8 0.7999 nan 0.1000 0.0158
## 9 0.7579 nan 0.1000 0.0185
## 10 0.7290 nan 0.1000 0.0088
## 20 0.4896 nan 0.1000 0.0081
## 40 0.2953 nan 0.1000 0.0003
## 60 0.2064 nan 0.1000 -0.0009
## 80 0.1515 nan 0.1000 -0.0003
## 100 0.1116 nan 0.1000 -0.0009
## 120 0.0852 nan 0.1000 -0.0003
## 140 0.0682 nan 0.1000 -0.0002
## 150 0.0602 nan 0.1000 0.0002
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2685 nan 0.1000 0.0246
## 2 1.2236 nan 0.1000 0.0199
## 3 1.1833 nan 0.1000 0.0204
## 4 1.1487 nan 0.1000 0.0162
## 5 1.1206 nan 0.1000 0.0134
## 6 1.0951 nan 0.1000 0.0082
## 7 1.0691 nan 0.1000 0.0108
## 8 1.0392 nan 0.1000 0.0139
## 9 1.0150 nan 0.1000 0.0105
## 10 0.9936 nan 0.1000 0.0077
## 20 0.8225 nan 0.1000 0.0050
## 40 0.6322 nan 0.1000 0.0029
## 60 0.5167 nan 0.1000 0.0005
## 80 0.4363 nan 0.1000 0.0004
## 100 0.3842 nan 0.1000 0.0003
## 120 0.3411 nan 0.1000 -0.0005
## 140 0.3030 nan 0.1000 -0.0001
## 150 0.2854 nan 0.1000 -0.0001
# Resampled confusion matrix for the boosted model (percentage cells)
gbm.cm <- confusionMatrix(gbm.ml)
cm_plot(gbm.ml, "Gradient Model") #### ---------------------># plot to use in report
# Metrics from the percentage-scaled resampled confusion matrix:
# [2,2] true positives, [1,2] false negatives, [2,1] false positives.
gbm.tp <- gbm.cm$table[2, 2]
gbm.fn <- gbm.cm$table[1, 2]
gbm.fp <- gbm.cm$table[2, 1]
gbm.metrics <- data.frame(
"Model" = "Gradient Boosting",
"Accuracy" = (gbm.cm$table[1, 1] + gbm.tp) / 100,
"Recall" = gbm.tp / (gbm.tp + gbm.fn),
"Precision" = gbm.tp / (gbm.fp + gbm.tp),
"FNR" = gbm.fn / (gbm.tp + gbm.fn),
"Fscore" = (2 * gbm.tp) / (2 * gbm.tp + gbm.fn + gbm.fp)
)
gbm.metrics
## Model Accuracy Recall Precision FNR Fscore
## 1 Gradient Boosting 0.9348371 0.8657718 0.9555556 0.1342282 0.9084507
# Held-out test-set evaluation of the logistic model (direct call in
# place of the magrittr pipe; identical dispatch)
logit.pred <- predict(logit.ml, newdata = test_set_pca)
logit.matrix <- confusionMatrix(logit.pred, test_set_pca$diagnosis)
logit.matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 102 1
## 1 5 62
##
## Accuracy : 0.9647
## 95% CI : (0.9248, 0.9869)
## No Information Rate : 0.6294
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9253
##
## Mcnemar's Test P-Value : 0.2207
##
## Sensitivity : 0.9533
## Specificity : 0.9841
## Pos Pred Value : 0.9903
## Neg Pred Value : 0.9254
## Prevalence : 0.6294
## Detection Rate : 0.6000
## Detection Prevalence : 0.6059
## Balanced Accuracy : 0.9687
##
## 'Positive' Class : 0
##
# Held-out test-set evaluation of the PCA random forest
rfPCA.pred <- predict(rfPCA.ml, newdata = test_set_pca)
rfPCA.matrix <- confusionMatrix(rfPCA.pred, test_set_pca$diagnosis)
rfPCA.matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 101 7
## 1 6 56
##
## Accuracy : 0.9235
## 95% CI : (0.8728, 0.9587)
## No Information Rate : 0.6294
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8355
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9439
## Specificity : 0.8889
## Pos Pred Value : 0.9352
## Neg Pred Value : 0.9032
## Prevalence : 0.6294
## Detection Rate : 0.5941
## Detection Prevalence : 0.6353
## Balanced Accuracy : 0.9164
##
## 'Positive' Class : 0
##
# Held-out test-set evaluation of the non-PCA random forest. The reference
# labels are taken from test_set (the data the predictions were made on);
# the original compared against test_set_pca$diagnosis, which only works
# if the two test sets happen to share row order.
rf.pred <- predict(rf.ml, newdata = test_set)
rf.matrix <- confusionMatrix(rf.pred, test_set$diagnosis)
rf.matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 102 4
## 1 5 59
##
## Accuracy : 0.9471
## 95% CI : (0.9019, 0.9755)
## No Information Rate : 0.6294
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8869
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9533
## Specificity : 0.9365
## Pos Pred Value : 0.9623
## Neg Pred Value : 0.9219
## Prevalence : 0.6294
## Detection Rate : 0.6000
## Detection Prevalence : 0.6235
## Balanced Accuracy : 0.9449
##
## 'Positive' Class : 0
##
# Held-out test-set evaluation of the KNN model
knn.pred <- predict(knn.ml, newdata = test_set_pca)
knn.matrix <- confusionMatrix(knn.pred, test_set_pca$diagnosis)
knn.matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 105 10
## 1 2 53
##
## Accuracy : 0.9294
## 95% CI : (0.8799, 0.963)
## No Information Rate : 0.6294
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.8446
##
## Mcnemar's Test P-Value : 0.04331
##
## Sensitivity : 0.9813
## Specificity : 0.8413
## Pos Pred Value : 0.9130
## Neg Pred Value : 0.9636
## Prevalence : 0.6294
## Detection Rate : 0.6176
## Detection Prevalence : 0.6765
## Balanced Accuracy : 0.9113
##
## 'Positive' Class : 0
##
# Held-out test-set evaluation of the boosted model
gbm.pred <- predict(gbm.ml, newdata = test_set_pca)
gbm.matrix <- confusionMatrix(gbm.pred, test_set_pca$diagnosis)
gbm.matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 104 5
## 1 3 58
##
## Accuracy : 0.9529
## 95% CI : (0.9094, 0.9795)
## No Information Rate : 0.6294
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8985
##
## Mcnemar's Test P-Value : 0.7237
##
## Sensitivity : 0.9720
## Specificity : 0.9206
## Pos Pred Value : 0.9541
## Neg Pred Value : 0.9508
## Prevalence : 0.6294
## Detection Rate : 0.6118
## Detection Prevalence : 0.6412
## Balanced Accuracy : 0.9463
##
## 'Positive' Class : 0
##
#library(data.table)
library(formattable)
# Helper: collapse a caret confusionMatrix into a one-column data frame
# holding Accuracy plus the first four byClass metrics (Sensitivity,
# Specificity, Pos Pred Value, Neg Pred Value), labelled with the model
# name. Replaces five copy-pasted extraction stanzas; row names come from
# the metric names, which later become column names after the transpose.
cm_summary <- function(cm, label) {
  metrics <- as.data.frame(c(cm$overall["Accuracy"], cm$byClass[1:4]))
  colnames(metrics) <- label
  metrics
}
logit <- cm_summary(logit.matrix, "Logit PCA")
rfPCA <- cm_summary(rfPCA.matrix, "Random Forest PCA")
rf <- cm_summary(rf.matrix, "Random Forest without PCA")
knn <- cm_summary(knn.matrix, "KNN PCA")
gbm <- cm_summary(gbm.matrix, "GBM PCA")
# Transpose so each model is a row and each metric a column
final <- as.data.frame(t(cbind(logit, rfPCA, rf, knn, gbm)))
formattable(final)
| Accuracy | Sensitivity | Specificity | Pos Pred Value | Neg Pred Value | |
|---|---|---|---|---|---|
| Logit PCA | 0.9647059 | 0.9532710 | 0.9841270 | 0.9902913 | 0.9253731 |
| Random Forest PCA | 0.9235294 | 0.9439252 | 0.8888889 | 0.9351852 | 0.9032258 |
| Random Forest without PCA | 0.9470588 | 0.9532710 | 0.9365079 | 0.9622642 | 0.9218750 |
| KNN PCA | 0.9294118 | 0.9813084 | 0.8412698 | 0.9130435 | 0.9636364 |
| GBM PCA | 0.9529412 | 0.9719626 | 0.9206349 | 0.9541284 | 0.9508197 |